

/*------------------------------------------------------------------------------
	1. Housekeeping
------------------------------------------------------------------------------*/
	*Run housekeeping code
		qui do ".../replication_package/housekeeping.do"

	*Write log
		log using "$log_loc/figure_5_panel_a.log", text replace

/*------------------------------------------------------------------------------
	2. Clean Crosswalk
------------------------------------------------------------------------------*/

			*Import zip_code-tract crosswalk
			import excel using "${data}/Zip-Census Tract/ZIP_TRACT_122018.xlsx", clear first

				*Make variables lower case
					ren *, lower
	
				*Keep variables needed for the crosswalk
					keep tract tot_ratio zip
	
				*Rename zip code
					ren zip zip_code

				*Drop tract-zip combinations where tract does not correspond to any zip code
					drop if tot_ratio == 0
	
				*Check id
					isid tract zip_code
	
			*Save dataset 
			tempfile zip_cross_2018
			save `zip_cross_2018', replace

/*------------------------------------------------------------------------------
	3. Estimate share of (kids age_numd 0-14)/(kids age_numd 0-17)
------------------------------------------------------------------------------*/

			import excel using "$data/Census Count/nc-est2019-syasexn.xlsx", clear cellra(A6:M23)
				keep A K L M
				ren (A K L M) (age_num census_2017 census_2018 census_2019)

			tempfile census_data
			save `census_data', replace

				replace age_num = substr(age_num, 2, .)
					destring age_num, replace
					sort age_num
	
				*Make data long (i.e., by age_num and year instead of just by age_num)
					reshape long census_, i(age_num) j(year)
					ren census_ census_ct
					isid year age_num

				*Keep if year is the one to be analyzed
					keep if year == 2018

				*Estimate the share of children under 14
					qui sum if age_num <= 14,d
					local ct_14_and_under = r(sum)

					qui sum if age_num <= 17, d
					local ct_17_and_under = r(sum)

					local share_14_and_under = `ct_14_and_under'/`ct_17_and_under'
					di "For 2018, there are `ct_14_and_under' children 14 and under and `ct_17_and_under' children 17 and under"
					di "This corresponds to a share of `share_14_and_under' children being 14 and under"

/*------------------------------------------------------------------------------
	4. Prepare Kids Per Tract Dataset
------------------------------------------------------------------------------*/		
		*Import Household Age Data
		import delimited using "$data/B09001/ACSDT5Y2018.B09001-Data.csv", clear rowrange(2) varn(2)

			*Keep key variables
				keep geography geographicareaname estimatetotal estimatetotalinhouseholds estimatetotalinhouseholdsunder3y estimatetotalinhouseholds3and4ye estimatetotalinhouseholds5years estimatetotalinhouseholds6to8yea estimatetotalinhouseholds9to11ye estimatetotalinhouseholds12to14y estimatetotalinhouseholds15to17y

			*Rename variables
				ren (geography geographicareaname estimatetotal estimatetotalinhouseholds estimatetotalinhouseholdsunder3y estimatetotalinhouseholds3and4ye estimatetotalinhouseholds5years estimatetotalinhouseholds6to8yea estimatetotalinhouseholds9to11ye estimatetotalinhouseholds12to14y estimatetotalinhouseholds15to17y) (tract tract_name total in_hh age_num_less_than_3 age_num_3_to_4 age_num_5 age_num_6_to_8 age_num_9_to_11 age_num_12_to_14 age_num_15_to_17)

			*Clean tract variable (last 11 values in tract variable)
				replace tract = substr(tract, -11, 11)

			*Get ct from 0-14
				gen age_num_0_14_hh = age_num_less_than_3 + age_num_3_to_4 + age_num_5 + age_num_6_to_8 + age_num_9_to_11 + age_num_12_to_14
			
			/*Estimate the proportion of non-household kids that are 14 by multiplying the number of kids (0-17) 
			by the proportion of kids in Census that are 0-14*/ 
				gen in_group_quarters = total - in_hh
				gen age_num_0_14_gq = in_group_quarters * `share_14_and_under'

			*Estimate the total number of kids that are 0-14
				gen age_num_0_14_acs = age_num_0_14_gq + age_num_0_14_hh

			*Keep key variables
				keep tract age_num_0_14_acs 

		tempfile kids
		save `kids', replace
/*------------------------------------------------------------------------------
	5. Prepare Income-by-tract dataset
------------------------------------------------------------------------------*/

		*Import Income Data
		import delimited using "$data/B19001/ACSDT5Y2018.B19001-Data.csv", clear rowrange(2) varn(2)
	
			*Keep Key variables
				keep geography geographicareaname estimatetotal estimatetotallessthan10000 estimatetotal10000to14999 estimatetotal15000to19999 estimatetotal20000to24999 estimatetotal25000to29999

			*Rename variables
				ren (geography geographicareaname) (tract tract_name)

			*Convert variables to string. Will not effect all variables, but 2018 is already string, so want to harmonize datasets
				tostring estimatetotal estimatetotallessthan10000 estimatetotal10000to14999 estimatetotal15000to19999 estimatetotal20000to24999 estimatetotal25000to29999, replace

			*Destring variables
			foreach var of varlist estimatetotal estimatetotallessthan10000 estimatetotal10000to14999 estimatetotal15000to19999 estimatetotal20000to24999 estimatetotal25000to29999 {
				replace `var' = "" if `var' == "null"
				drop if `var' == ""
				destring `var', replace
			}

			*Generate variables
				gen total_hh = estimatetotal
				gen hh_inc_30k_or_less = estimatetotallessthan10000 + estimatetotal10000to14999 + estimatetotal15000to19999 + estimatetotal20000to24999 + estimatetotal25000to29999
				gen hh_inc_30k_or_more = estimatetotal - hh_inc_30k_or_less
	
			*Clean tract variable (last 11 values in tract variable)
				replace tract = substr(tract, -11, 11)

			*merge with kids dataset
				merge 1:1 tract using `kids'
					keep if _merge == 3
					drop _merge

			merge 1:m tract using `zip_cross_2018'
				keep if _merge == 3
				drop _merge

			/*Tot_ratio = ratio of addresses in tract-zip record to total number of addresses in tract
			To allocate data from tract to zip, multiple number of observations in tract by tot_ratio for each zip associated with tract */
			foreach i of varlist age_num_0_14_acs total_hh hh_inc_30k_or_more{
				replace `i' = `i' * tot_ratio
			}

			collapse (sum) age_num_0_14_acs total_hh hh_inc_30k_or_more, by(zip_code)

			*Generate pct race estimates by zip code
				gen pct_30k = 100 * hh_inc_30k_or_more/total_hh
	
		tempfile inc_data_2018
		save `inc_data_2018', replace

/*------------------------------------------------------------------------------
	6. Clean Main Datasets
------------------------------------------------------------------------------*/

		*Import Data
		use "$data/childrens2018.dta", clear

			/*Restrict Sample*/
				*Keep 1095 filings
					rename state_insurance state
					fmerge m:1 state using "$data/statelist.dta"
						assert _merge != 2 //Confirm that all 51 states are in dataset
						drop if _merge != 3

				*Keep if age_num is within the range
					keep if age_num >= 0 & age_num <= 16

				*Keep if kid died after end of year or is currently alive
					keep if 2018*10000+1231<death_date|death_date==0

				*Keep if months covered is possible
					keep if num_cov_ins > 0 & num_cov_ins <= 12

			/*Generate variables for figure*/		
				*Generate outcome variable on claiming
					gen onreturn = file_inc != .
	
				*Rename zip code variable
					ren (zipcode_ins) (zip_code)

				*Generate a count variable
					gen ct = 1

				*Collapse to the by-zipcode level		
					collapse (sum) ct onreturn, by(zip_code)

				*Adjust for claiming limits
					replace onreturn = onreturn * 1.0017029

		*Generate outcome (i.e., percent of children claimed)
			gen perc_return = onreturn/ct

			*Adjust outcome to be out of 100
				replace perc_return = perc_return*100
			
			*Merge with main dataset
				merge 1:1 zip_code using `inc_data_2018'
					keep if _merge == 3

/*------------------------------------------------------------------------------
	7. Create Figure 4
------------------------------------------------------------------------------*/
			/*Create figure*/		
				binscatter perc_return pct_30k [w = age_num_0_14_acs], nq(20) ///
				xtitle("% Households with Income > $30,000") ytitle("% Children Claimed") ///
				reportreg

				graph export "${output}/Percent Income Above 30K 2018.png", replace as(png)

	